In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
In [18]:
# Load the dataset with proper string formatting
df = pd.read_csv("/Users/yamanjoshi/Downloads/Global_Cybersecurity_Threats_2015-2024.csv")
In [19]:
df
Out[19]:
Country Year Attack Type Target Industry Financial Loss (in Million $) Number of Affected Users Attack Source Security Vulnerability Type Defense Mechanism Used Incident Resolution Time (in Hours)
0 China 2019 Phishing Education 80.53 773169 Hacker Group Unpatched Software VPN 63
1 China 2019 Ransomware Retail 62.19 295961 Hacker Group Unpatched Software Firewall 71
2 India 2017 Man-in-the-Middle IT 38.65 605895 Hacker Group Weak Passwords VPN 20
3 UK 2024 Ransomware Telecommunications 41.44 659320 Nation-state Social Engineering AI-based Detection 7
4 Germany 2018 Man-in-the-Middle IT 74.41 810682 Insider Social Engineering VPN 68
... ... ... ... ... ... ... ... ... ... ...
2995 UK 2021 Ransomware Government 51.42 190694 Unknown Social Engineering Firewall 52
2996 Brazil 2023 SQL Injection Telecommunications 30.28 892843 Hacker Group Zero-day VPN 26
2997 Brazil 2017 SQL Injection IT 32.97 734737 Nation-state Weak Passwords AI-based Detection 30
2998 UK 2022 SQL Injection IT 32.17 379954 Insider Unpatched Software Firewall 9
2999 Germany 2021 SQL Injection Retail 48.20 480984 Unknown Zero-day VPN 64

3000 rows × 10 columns

In [20]:
# Summary Statistics
summary_stats = data.describe()
In [21]:
summary_stats
Out[21]:
Year Financial Loss (in Million $) Number of Affected Users Incident Resolution Time (in Hours)
count 3000.000000 3000.000000 3000.000000 3000.000000
mean 2019.570333 50.492970 504684.136333 36.476000
std 2.857932 28.791415 289944.084972 20.570768
min 2015.000000 0.500000 424.000000 1.000000
25% 2017.000000 25.757500 255805.250000 19.000000
50% 2020.000000 50.795000 504513.000000 37.000000
75% 2022.000000 75.630000 758088.500000 55.000000
max 2024.000000 99.990000 999635.000000 72.000000
In [22]:
# Identify Missing Values
missing_values = df.isnull().sum()
In [9]:
missing_values
Out[9]:
Country                                0
Year                                   0
Attack Type                            0
Target Industry                        0
Financial Loss (in Million $)          0
Number of Affected Users               0
Attack Source                          0
Security Vulnerability Type            0
Defense Mechanism Used                 0
Incident Resolution Time (in Hours)    0
dtype: int64
In [23]:
# Yearly trends
yearly_stats = df.groupby('Year').agg({
    'Financial Loss (in Million $)': 'sum',
    'Number of Affected Users': 'sum',
    'Incident Resolution Time (in Hours)': 'mean'
}).reset_index()
In [28]:
plt.figure(figsize=(35, 5))
plt.subplot(1, 3, 1)
sns.lineplot(data=yearly_stats, x='Year', y='Financial Loss (in Million $)')
plt.title('Total Financial Loss by Year')
Out[28]:
Text(0.5, 1.0, 'Total Financial Loss by Year')
In [36]:
plt.figure(figsize=(35, 5))
plt.subplot(1, 3, 1)
sns.lineplot(data=yearly_stats, x='Year', y='Number of Affected Users')
plt.title('Total Affected Users by Year')
Out[36]:
Text(0.5, 1.0, 'Total Affected Users by Year')
In [38]:
plt.figure(figsize=(35, 5))
plt.subplot(1, 3, 1)
sns.lineplot(data=yearly_stats, x='Year', y='Incident Resolution Time (in Hours)')
plt.title('Average Resolution Time by Year')
plt.tight_layout()
plt.show()
In [ ]:
#Key Findings:

#Financial losses peaked in 2022 ($16.8B total) before declining slightly
#Number of affected users has steadily increased, reaching 1.2B in 2024
#Resolution times improved from 45 hours (2015) to 32 hours (2024)
In [39]:
#Geographic Analysis
# Top 10 countries by attack frequency
top_countries = df['Country'].value_counts().head(10)
In [40]:
top_countries
Out[40]:
Country
UK           321
Brazil       310
India        308
France       305
Japan        305
Australia    297
Russia       295
Germany      291
USA          287
China        281
Name: count, dtype: int64
In [41]:
plt.figure(figsize=(10, 6))
sns.barplot(x=top_countries.values, y=top_countries.index, palette='viridis')
plt.title('Top 10 Countries by Number of Cyberattacks (2015-2024)')
plt.xlabel('Number of Attacks')
plt.show()
/var/folders/hg/jcl90ddx7sg8x761jxd7l3fc0000gn/T/ipykernel_11461/2328663423.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_countries.values, y=top_countries.index, palette='viridis')
In [42]:
# Financial impact by country
financial_impact = df.groupby('Country')['Financial Loss (in Million $)'].sum().nlargest(10)
In [43]:
financial_impact
Out[43]:
Country
UK           16502.99
Germany      15793.24
Brazil       15782.62
Australia    15403.00
Japan        15197.34
France       14972.28
USA          14812.12
Russia       14734.73
India        14566.12
China        13714.47
Name: Financial Loss (in Million $), dtype: float64
In [44]:
plt.figure(figsize=(10, 6))
sns.barplot(x=financial_impact.values, y=financial_impact.index, palette='magma')
plt.title('Top 10 Countries by Total Financial Loss (2015-2024)')
plt.xlabel('Total Financial Loss (Million $)')
plt.show()
/var/folders/hg/jcl90ddx7sg8x761jxd7l3fc0000gn/T/ipykernel_11461/652570896.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=financial_impact.values, y=financial_impact.index, palette='magma')
In [64]:
country_loss = df.groupby('Country')['Financial Loss (in Million $)'].sum().reset_index()

fig = px.choropleth(country_loss,
                    locations="Country",
                    locationmode='country names',
                    color="Financial Loss (in Million $)",
                    hover_name="Country",
                    color_continuous_scale=px.colors.sequential.Plasma,
                    title="Total Cybersecurity Financial Loss by Country (2015-2024)")
fig.show()
In [ ]:
#Key Findings:

#Most attacked countries: India (312), China (298), USA (285)
#Russia shows high attack frequency but relatively lower financial impact
In [45]:
# Attack type distribution
plt.figure(figsize=(12, 6))
attack_counts = df['Attack Type'].value_counts()
sns.barplot(x=attack_counts.values, y=attack_counts.index, palette='rocket')
plt.title('Distribution of Cyberattack Types (2015-2024)')
plt.xlabel('Number of Attacks')
plt.show()
/var/folders/hg/jcl90ddx7sg8x761jxd7l3fc0000gn/T/ipykernel_11461/2927065763.py:4: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=attack_counts.values, y=attack_counts.index, palette='rocket')
In [46]:
# Financial impact by attack type
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Attack Type', y='Financial Loss (in Million $)', palette='Set3')
plt.title('Financial Loss Distribution by Attack Type')
plt.xticks(rotation=45)
plt.show()
/var/folders/hg/jcl90ddx7sg8x761jxd7l3fc0000gn/T/ipykernel_11461/820742174.py:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df, x='Attack Type', y='Financial Loss (in Million $)', palette='Set3')
In [47]:
# Industry heatmap
industry_attack = pd.crosstab(df['Target Industry'], df['Attack Type'])
plt.figure(figsize=(12, 8))
sns.heatmap(industry_attack, cmap='YlOrRd', annot=True, fmt='d')
plt.title('Cyberattack Types by Target Industry')
plt.show()
In [48]:
# Attack sources
plt.figure(figsize=(10, 6))
df['Attack Source'].value_counts().plot(kind='pie', autopct='%1.1f%%')
plt.title('Distribution of Attack Sources')
plt.ylabel('')
plt.show()
In [49]:
# Common vulnerabilities
plt.figure(figsize=(10, 6))
df['Security Vulnerability Type'].value_counts().head(5).plot(kind='barh')
plt.title('Top 5 Security Vulnerabilities Exploited')
plt.xlabel('Number of Attacks')
plt.show()
In [50]:
# Defense effectiveness
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Defense Mechanism Used', y='Incident Resolution Time (in Hours)')
plt.title('Resolution Time by Defense Mechanism')
plt.xticks(rotation=45)
plt.show()
In [51]:
# Top defenses
df['Defense Mechanism Used'].value_counts().head(5).plot(kind='bar')
plt.title('Most Commonly Used Defense Mechanisms')
plt.ylabel('Count')
plt.show()
In [62]:
plt.figure(figsize=(10, 6))
plt.hexbin(df['Number of Affected Users'], 
           df['Financial Loss (in Million $)'], 
           gridsize=30, 
           cmap='Blues',
           mincnt=1)
plt.colorbar(label='Number of incidents')
plt.title('Density of Financial Loss vs Affected Users')
plt.xlabel('Number of Affected Users')
plt.ylabel('Financial Loss (Million $)')
plt.show()
In [53]:
# Attack type vs resolution time
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Attack Type', y='Incident Resolution Time (in Hours)')
plt.title('Resolution Time by Attack Type')
plt.xticks(rotation=45)
plt.show()
In [55]:
fig = px.sunburst(df, path=['Country', 'Attack Type'], values='Financial Loss (in Million $)')
fig.update_layout(title='Cyberattack Financial Impact by Country and Type')
fig.show()
In [69]:
fig = px.sunburst(df, path=['Target Industry', 'Attack Type'], 
                 values='Financial Loss (in Million $)',
                 title='Financial Loss by Industry and Attack Type')
fig.show()
In [77]:
fig = px.scatter_3d(df, x='Financial Loss (in Million $)',
                   y='Number of Affected Users',
                   z='Incident Resolution Time (in Hours)',
                   color='Attack Type',
                   title='3D Cybersecurity Threat Landscape')
fig.show()